library(beeswarm)
library(naniar)
library(zoo)

Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric
# install.packages("zoo")
library(janitor)

Attaching package: ‘janitor’

The following objects are masked from ‘package:stats’:

    chisq.test, fisher.test
library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
# install.packages("GGally")
# library(sets)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.3     ✓ purrr   0.3.4
✓ tibble  3.0.6     ✓ stringr 1.4.0
✓ tidyr   1.1.2     ✓ forcats 0.5.1
✓ readr   1.4.0     
── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(GGally) # for ggpairs
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2
library(lubridate)

Attaching package: ‘lubridate’

The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union
# install.packages("maps")
# library(maps)
load_file <- function(file_path){
  read_csv(file_path)
}

tx_data <- load_file("./../data/COVID-19_cases_TX.csv")

── Column specification ─────────────────────────────────────────────────────────────────────────────────────────
cols(
  county_fips_code = col_character(),
  county_name = col_character(),
  state = col_character(),
  state_fips_code = col_double(),
  date = col_date(format = ""),
  confirmed_cases = col_double(),
  deaths = col_double()
)
global_mobility_report <- load_file("./../data/Global_Mobility_Report.csv")

── Column specification ─────────────────────────────────────────────────────────────────────────────────────────
cols(
  country_region_code = col_character(),
  country_region = col_character(),
  sub_region_1 = col_character(),
  sub_region_2 = col_logical(),
  metro_area = col_logical(),
  iso_3166_2_code = col_character(),
  census_fips_code = col_logical(),
  date = col_date(format = ""),
  retail_and_recreation_percent_change_from_baseline = col_double(),
  grocery_and_pharmacy_percent_change_from_baseline = col_double(),
  parks_percent_change_from_baseline = col_double(),
  transit_stations_percent_change_from_baseline = col_double(),
  workplaces_percent_change_from_baseline = col_double(),
  residential_percent_change_from_baseline = col_double()
)

4199216 parsing failures.
 row        col           expected                  actual                                   file
3036 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
3037 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
3038 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
3039 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
3040 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area './../data/Global_Mobility_Report.csv'
.... .......... .................. ....................... ......................................
See problems(...) for more details.
cases_plus_census <- load_file("./../data/COVID-19_cases_plus_census.csv")

── Column specification ─────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_double(),
  county_fips_code = col_character(),
  county_name = col_character(),
  state = col_character(),
  state_fips_code = col_character(),
  date = col_date(format = ""),
  geo_id = col_character(),
  pop_5_years_over = col_logical(),
  speak_only_english_at_home = col_logical(),
  speak_spanish_at_home = col_logical(),
  speak_spanish_at_home_low_english = col_logical(),
  pop_15_and_over = col_logical(),
  pop_never_married = col_logical(),
  pop_now_married = col_logical(),
  pop_separated = col_logical(),
  pop_widowed = col_logical(),
  pop_divorced = col_logical()
)
ℹ Use `spec()` for the full column specifications.
cols_keep <- c("county_fips_code", "confirmed_cases", "deaths", "median_income", "male_pop", "female_pop", "total_pop", "median_age", "worked_at_home")
subset_census <- cases_plus_census[cols_keep]

# cols_keep <- c("date", "retail_and_recreation_percent_change_from_baseline", "grocery_and_pharmacy_percent_change_from_baseline", "parks_percent_change_from_baseline", "transit_stations_percent_change_from_baseline", "workplaces_percent_change_from_baseline", "residential_percent_change_from_baseline")
# subset_mobility <- global_mobility_report[cols_keep]
# glo
# subset_mobility$date <- as.Date(subset_mobility$date, format="%Y-%m-%d")
# global_mobility_report
vis_miss(global_mobility_report, sort_miss = T, warn_large_data= F)

vis_miss(tx_data, sort_miss = T, warn_large_data= F)

vis_miss(subset_census, sort_miss = T, warn_large_data = F)

library(RColorBrewer)
plot_vs_county <- function(df, col_val, percentile=FALSE,
                           fips_title="county_fips_code", banks=6, 
                           legend_title="", graphic_title=""){
  # Subset for speed 
  df <- df[c(fips_title, col_val)]
  
  # Get county data
  gcounty <- ggplot2::map_data("county")
  # USA map data
  gusa <- map_data("state")
  
  if (banks > 9){
    mycolors <- colorRampPalette(brewer.pal(9, "Reds"))(banks)
  }
  
  # Format with subregions
  fipstab <-
      transmute(maps::county.fips, fips, county = sub(":.*", "", polyname)) %>%
      unique() %>%
      separate(county, c("region", "subregion"), sep = ",")
  
  # Combine in desired order (NA for missing)
  gcounty <- left_join(gcounty, fipstab, c("region", "subregion"))


  dis <- df
  dis$rprop <- rank(df[col_val])
  dis$pcls <- cut(100 * percent_rank(df[col_val]), seq(0, 100, len = banks),
                        include.lowest = TRUE)

  # Missing data
  anti_join(gcounty, dis, by = c("fips" = fips_title)) %>%
    select(region, subregion) %>%
    unique()
  gcounty_pop <- left_join(gcounty, dis, by = c("fips" = fips_title))
  fill_vals <- gcounty_pop[col_val]

  # Plot
  if (legend_title == ""){
    legend_title <- col_val
  }

  if (percentile == FALSE){
    # names(gcounty_pop)[names(gcounty_pop) == col_val] <- "col_of_interest"
    plt <- ggplot(gcounty_pop) +
      geom_polygon(aes(long, lat, group = group, fill = get(col_val)),
                   color = "grey", size = 0.1, name="Percent Infected") +
      geom_polygon(aes(long, lat, group = group),
                   fill = NA, data = gusa, color = "lightgrey") +
      coord_map("bonne", parameters = 41.6) + ggthemes::theme_map()+
      scale_fill_gradient2()
       # scale_fill_gradient(low = "white", high = "red", na.value = "grey")
      # scale_fill_gradientn(colours = terrain.colors(10))
  }

  if (percentile == TRUE){
    plt <- ggplot(gcounty_pop) +
      geom_polygon(aes(long, lat, group = group, fill = pcls),
                   color = "grey", size = 0.1) +
      geom_polygon(aes(long, lat, group = group),
                   fill = NA, data = gusa, color = "lightgrey") +
      coord_map("bonne", parameters = 41.6) + ggthemes::theme_map() +
      scale_fill_manual(values = mycolors, na.value = "grey") +
      # scale_fill_brewer(palette = "viridis", na.value = "grey") +
      theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
            legend.background = element_rect(fill = NA), 
            legend.position = "left")
  }
  plt <- plt + labs(fill=legend_title) + ggtitle(graphic_title)
  plt
}
subset_census
subset_census['pct_infected'] <- subset_census['confirmed_cases']/subset_census['total_pop']
subset_census['pct_deaths'] <- subset_census['deaths']/subset_census['total_pop']
subset_census$death_rate <- subset_census$deaths/subset_census$confirmed_cases
subset_census$county_fips_code <-as.integer(subset_census$county_fips_code)
subset_census
plot_vs_county(subset_census, "pct_infected", legend_title = "Percent Infected")
Ignoring unknown parameters: name

plot_vs_county(subset_census, "pct_infected", percentile = TRUE, banks=11, 
               legend_title = "Percentile Infected",
               graphic_title = "Percentile of Percentage of People Infected by County")

plot_vs_county(subset_census, "pct_deaths", percentile = TRUE, banks=11, 
               legend_title = "Percentile Deaths",
               graphic_title = "Percentile of Percentage of Deaths by County")

plot_vs_county(subset_census, "death_rate", percentile = TRUE, banks=11, 
               legend_title = "Percentile Mortality Rate",
               graphic_title = "Percentile of Mortality Rate by County")

plot_vs_county(subset_census, "death_rate", 
               legend_title = "Percentile Mortality Rate",
               graphic_title = "Percentile of Mortality Rate by County")
Ignoring unknown parameters: name

census_corr_cols <- c("deaths", "confirmed_cases", "median_income", "male_pop",
                      "female_pop", "total_pop", "median_age", "worked_at_home")
ggcorr(subset_census[census_corr_cols], low="red", mid="grey", high="blue", hjust= .75, size=3, 
       label = TRUE, label_size = 3, label_color = "white") + ggplot2::labs(title = "Pearson Correlation of Important Variables")

global_mobility_report
country_date_pct_change <- global_mobility_report %>% select(country_region_code
                                                             | contains("date") 
                                                             | contains("percent"))
country_date_pct_change
coi_downsampled <- country_date_pct_change %>% filter(country_region_code %in% 
                                                        c("US", "CA", "NZ", "AE", "CN", "DE", "JP")) %>% 
  filter(weekdays(date) == "Monday") %>% group_by(country_region_code, date) %>% summarise_all(mean, na.rm = T) %>% arrange(country_region_code, date)
coi_downsampled
interested_cols <- c("retail_and_recreation_percent_change_from_baseline",
                     "grocery_and_pharmacy_percent_change_from_baseline",
                     "parks_percent_change_from_baseline",
                     "transit_stations_percent_change_from_baseline",
                     "workplaces_percent_change_from_baseline",
                     "residential_percent_change_from_baseline")

col_labels <- c("Average Retail and Recreation Percent Change from Baseline",
                     "Average Grocery and Pharmacy Percent Change from Baseline",
                     "Average Parks Percent Change from Baseline",
                     "Average Transit Stations Percent Change from Baseline",
                     "Average Workplaces Percent Change from Baseline",
                     "Average Residential Percent Change from Baseline")


for (i in 1:length(col_labels)){
  print(i)
  plt <- ggplot(coi_downsampled,
       aes(x=date, y=get(interested_cols[i]), group=country_region_code,
                            color=country_region_code))+
  geom_point(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)), size=.5)+
  geom_line(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)))+
  # geom_point(size=.5)+geom_line()+
  labs(y = col_labels[i], x = "Date", color = "Country")+
  theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
        axis.title.y = element_text(size = 8))
  save_path <- paste(c("./../imgs/", interested_cols[i], ".png"), collapse = "")
  ggsave(save_path)
  print(plt)
}
[1] 1
Saving 7 x 7 in image
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6

# ggplot(coi_downsampled,
#        aes(x=date, y=grocery_and_pharmacy_percent_change_from_baseline, group=country_region_code,
#                             color=country_region_code))+
#   geom_point(aes(y=rollmean(retail_and_recreation_percent_change_from_baseline, k=10, na.pad=TRUE)), size=.5)+
#   geom_line(aes(y=rollmean(retail_and_recreation_percent_change_from_baseline, k=10, na.pad=TRUE)))+
#   # geom_point(size=.5)+geom_line()+
#   labs(y = "Average Grocery and Pharmacy Percent Change from Baseline", x = "Date", color = "Country")+
#   theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
#         axis.title.y = element_text(size = 8))
#   # geom_line()
tx_county_data <- tx_data %>% filter(county_name != "Statewide Unallocated")
tx_total_state_data <- tx_data %>% filter(county_name == "Statewide Unallocated")
tx_total_state_data$cumulative_deaths <- cumsum(tx_total_state_data$deaths)
tx_total_state_data$cumulative_cases <- cumsum(tx_total_state_data$confirmed_cases)
tx_county_data
tx_total_state_data
interested_cols <- c("confirmed_cases", "deaths", "cumulative_deaths", "cumulative_cases")
col_labels <- c("Confirmed Cases Per Day", "Deaths Per Day", "Total Deaths", "Total Cases")

for (i in 1:length(col_labels)){
  print(i)
  plt <- ggplot(tx_total_state_data,
       aes(x=date, y=get(interested_cols[i])))+
  # geom_point(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)), size=.5)+
  # geom_line(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)))+
  # geom_point(size=.5)+geom_line()+
    geom_line()+
  labs(y = col_labels[i], x = "Date")
  # theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
  #       axis.title.y = element_text(size = 8))
  # save_path <- paste(c("./../imgs/", interested_cols[i], ".png"), collapse = "")
  # ggsave(save_path)
  print(plt)
}
[1] 1
[1] 2
[1] 3
[1] 4

Since the above doesn’t really make sense (only 78 confirmed cases with over a thousand deaths), I am going to analyze on a per county basis and maybe that data will be more clear.

tx_by_day_based_on_county <- tx_county_data %>% 
  select(date, confirmed_cases, deaths) %>%
  group_by(date) %>% 
  summarise_all(sum, na.rm = T) %>%
  arrange(date)

interested_cols <- c("confirmed_cases", "deaths")
col_labels <- c("Total Cases", "Total Deaths")

for (i in 1:length(col_labels)){
  print(i)
  plt <- ggplot(tx_by_day_based_on_county,
       aes(x=date, y=get(interested_cols[i])))+
  # geom_point(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)), size=.5)+
  # geom_line(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)))+
  geom_point(size=.5)+geom_line()+
    # geom_line()+
  labs(y = col_labels[i], x = "Date")
  # theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
  #       axis.title.y = element_text(size = 8))
  # save_path <- paste(c("./../imgs/", interested_cols[i], ".png"), collapse = "")
  # ggsave(save_path)
  print(plt)
}
[1] 1
[1] 2

tx_by_day_based_on_county
library(scales)
# tx_by_day_based_on_county %>% pivot_longer(!date, names_to = "type", values_to = "count")
ggplot(tx_by_day_based_on_county %>% pivot_longer(!date, names_to = "type", values_to = "count"),
       aes(x=date, y=count, group=type, color = type))+
         geom_line()+
  scale_y_continuous(trans = "log10",
                     labels = trans_breaks('log10', math_format(10^.x)))+
  scale_color_manual(labels = c("Confirmed Cases", "Deaths"), values = c("confirmed_cases"="blue",
                                                                         "deaths"="red"))+
  labs(y = "Total Persons", x = "Date", color = "")+
  theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)))+
  ggtitle("Texas COVID-19 Cases")
  ggsave("./../imgs/texas_covid_cases_total.png")
Saving 7.29 x 4.51 in image

  # scale_y_log10()
         # labs(y = "Total Persons", x = "Date"))

# )
# 
# plt <- ggplot(coi_downsampled,
#        aes(x=date, y=get(interested_cols[i]), group=country_region_code,
#                             color=country_region_code))+
#   geom_point(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)), size=.5)+
#   geom_line(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)))+
#   # geom_point(size=.5)+geom_line()+
#   labs(y = col_labels[i], x = "Date", color = "Country")+
#   theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
#         axis.title.y = element_text(size = 8))
#   save_path <- paste(c("./../imgs/", interested_cols[i], ".png"), collapse = "")
#   ggsave(save_path)
#   print(plt)
tx_county_data
subset_census

tx_state <- map_data("state") %>% subset(region == "texas")
tx_county_map_data <- map_data("county") %>% subset(region == "texas")

tx_state
tx_county_map_data

gcounty <- map_data("county")
# 
# fipstab <-
#       transmute(maps::county.fips, fips, county = sub(":.*", "", polyname)) %>%
#       unique() %>%
#       separate(county, c("region", "subregion"), sep = ",")

# Format with subregions
fipstab <-
  transmute(maps::county.fips, fips, county = sub(":.*", "", polyname)) %>%
  unique() %>%
  separate(county, c("region", "subregion"), sep = ",")
  
  # # Combine in desired order (NA for missing)
  # gcounty <- left_join(gcounty, fipstab, c("region", "subregion"))


tx_geo_data <- left_join(gcounty, fipstab, c("region", "subregion")) %>%
  left_join(subset_census, c("fips" = "county_fips_code")) %>% filter(region == "texas") %>%
  unique()

tx_geo_data$dinfect_pcls <- cut(100 * percent_rank(tx_geo_data$pct_infected), seq(0, 100, len = 11),
                        include.lowest = TRUE)
tx_geo_data$deaths_pcls <- cut(100 * percent_rank(tx_geo_data$pct_deaths), seq(0, 100, len = 11),
                        include.lowest = TRUE)

tx_geo_data



# 
# # Lowercase
# tx_county_data$county_name <- tolower(tx_county_data$county_name)
# 
# # Remove ' county'
# tx_county_data$county_name <- sub("\\s*county\\b.*", "", tx_county_data$county_name)
# tx_county_data
# 
# # Get max confirmed cases and deaths
# tx_county_data
# 
# 
# # # Join the data with state geo info
# # tx_geo_data <- left_join(tx_county_map_data, tx_county_data, by = c("subregion" = "county_name"))
# # tx_geo_data

mycolors <- colorRampPalette(brewer.pal(9, "Reds"))(11)

ggplot(tx_geo_data)+
  coord_map() + ggthemes::theme_map()+
  geom_polygon(aes(long, lat, group = group, fill=deaths))+
  # scale_fill_manual(values = mycolors, na.value = "grey") +
  # theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
  #       legend.background = element_rect(fill = NA), 
  #       legend.position = "left")+
  labs(fill="Percentil") + ggtitle("graphic_title")




# plt <- ggplot(gcounty_pop) +
#       geom_polygon(aes(long, lat, group = group, fill = dinfect_pcls),
#                    color = "grey", size = 0.1) +
#       geom_polygon(aes(long, lat, group = group),
#                    fill = NA, data = gusa, color = "lightgrey") +
#       coord_map("bonne", parameters = 41.6) + ggthemes::theme_map() +
#       scale_fill_manual(values = mycolors, na.value = "grey") +
#       # scale_fill_brewer(palette = "viridis", na.value = "grey") +
#       theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
#             legend.background = element_rect(fill = NA), 
#             legend.position = "left")

# ggplot(tx_geo_data) +
#          coord_map() + 
#   ggthemes::theme_map()+
#   geom_polygon(aes(long, lat, group = subregion, fill = confirmed_cases))
# pc_cont_iowa <- geom_polygon(aes(long, lat, group = group, fill = pchange),
#                              color = "grey", size = 0.2)
# dallas_data <- tx_county_data %>% filter(county_name == "dallas") 
dallas_data <- tx_county_data %>% filter(county_name == "dallas") %>% 
  select(date, confirmed_cases, deaths)
Error in filter(., county_name == "dallas") : 
  object 'tx_county_data' not found
sgcounty <- map_data("county")

# Format with subregions
fipstab <-
  transmute(maps::county.fips, fips, county = sub(":.*", "", polyname)) %>%
  unique() %>%
  separate(county, c("region", "subregion"), sep = ",")

us_geo_data <- left_join(gcounty, fipstab, c("region", "subregion")) %>%
  left_join(subset_census, c("fips" = "county_fips_code")) %>%
  unique() %>% select(long, lat, region, subregion, group, confirmed_cases, deaths, median_income, 
                      male_pop, female_pop, total_pop, median_age,
                      worked_at_home)
# us_geo_data

# Get the total cases by state
state_data_breakdown <- us_geo_data %>% 
  drop_na() %>% select(region, subregion, confirmed_cases, deaths, median_income, 
                      male_pop, female_pop, total_pop, median_age,
                      worked_at_home) %>% unique() %>%
  group_by(region) %>% select_if(is.numeric) %>%
  summarise_all(sum)

state_data_breakdown$pct_deaths <- state_data_breakdown$deaths/state_data_breakdown$total_pop
state_data_breakdown$pct_infect <- state_data_breakdown$confirmed_cases/state_data_breakdown$total_pop
state_data_breakdown$death_rate <- state_data_breakdown$deaths/state_data_breakdown$confirmed_cases

print(state_data_breakdown %>% arrange(death_rate, decreasing=T))

state_geo_data <- us_geo_data %>% select(long, lat, region, group) %>% left_join(state_data_breakdown,
                                                        by = "region")

state_geo_data

# ggplot(tx_by_day_based_on_county %>% pivot_longer(!date, names_to = "type", values_to = "count"),
#        aes(x=date, y=count, group=type, color = type))+
#          geom_line()+
#   scale_y_continuous(trans = "log10",
#                      labels = trans_breaks('log10', math_format(10^.x)))+
#   scale_color_manual(labels = c("Confirmed Cases", "Deaths"), values = c("confirmed_cases"="blue",
#                                                                          "deaths"="red"))+
#   labs(y = "Total Persons", x = "Date", color = "")+
#   theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)))+
#   ggtitle("US COVID-19 Cases")
  # ggsave("./../imgs/texas_covid_cases_total.png")
# state_geo_data$state_group <- factor
state_geo_data$state_group <- factor(state_geo_data$region) 
state_geo_data$state_group <- as.numeric(state_geo_data$state_group)
state_geo_data
# cf <- coord_fixed()
# cf$default <- TRUE
# state_geo_data$state_group <- 
plt <- ggplot(state_geo_data)+
  # geom_polygon(data=subset(map_data("state")), aes(x=long, y=lat, group=group, fill=death_rate))+
  geom_polygon(aes(long, lat, group = group, fill=death_rate))+
  # scale_fill_manual(values = mycolors, na.value = "grey") +
  # theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
  #       legend.background = element_rect(fill = NA), 
  #       legend.position = "left")+
  
  # geom_polygon(aes(long, lat, group = group, fill = get(col_val)),
  #                  color = "grey", size = 0.1, name="Percent Infected") +
      geom_polygon(aes(long, lat, group = group),
                   fill = NA, data = map_data("state"), color = "lightgrey") +
      coord_map("bonne", parameters = 41.6) + ggthemes::theme_map()
  
  # labs(fill="Percentil") + ggtitle("graphic_title")+
  # coord_map("bonne", parameters = 41.6) + ggthemes::theme_map()
plt

# http://homepage.stat.uiowa.edu/~luke/classes/STAT4580-2020/maps.html
ggplot(state_geo_data)+
  # geom_polygon(data=subset(map_data("state")), aes(x=long, y=lat, group=group, fill=death_rate))+
  geom_polygon(aes(long, lat, group = group, fill=death_rate))+
  # scale_fill_manual(values = mycolors, na.value = "grey") +
  # theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
  #       legend.background = element_rect(fill = NA), 
  #       legend.position = "left")+
  
  # geom_polygon(aes(long, lat, group = group, fill = get(col_val)),
  #                  color = "grey", size = 0.1, name="Percent Infected") +
      geom_polygon(aes(long, lat, group = group),
                   fill = NA, data = map_data("state"), color = "lightgrey") +
      coord_map("bonne", parameters = 41.6) + ggthemes::theme_map()+
  
  labs(fill="Percentil") + ggtitle("graphic_title")+
  coord_map("bonne", parameters = 41.6) + ggthemes::theme_map()
state_geo_data
---
title: "Cleaned Notebook"
output: html_notebook
---

```{r}
library(beeswarm)
library(naniar)
library(zoo)
# install.packages("zoo")
library(janitor)
library(dplyr)
# install.packages("GGally")
# library(sets)
library(tidyverse)
library(ggplot2)
library(GGally) # for ggpairs
library(lubridate)
# install.packages("maps")
# library(maps)
```

```{r}
load_file <- function(file_path){
  read_csv(file_path)
}

tx_data <- load_file("./../data/COVID-19_cases_TX.csv")
global_mobility_report <- load_file("./../data/Global_Mobility_Report.csv")
cases_plus_census <- load_file("./../data/COVID-19_cases_plus_census.csv")
```
```{r}
cols_keep <- c("county_fips_code", "confirmed_cases", "deaths", "median_income", "male_pop", "female_pop", "total_pop", "median_age", "worked_at_home")
subset_census <- cases_plus_census[cols_keep]

# cols_keep <- c("date", "retail_and_recreation_percent_change_from_baseline", "grocery_and_pharmacy_percent_change_from_baseline", "parks_percent_change_from_baseline", "transit_stations_percent_change_from_baseline", "workplaces_percent_change_from_baseline", "residential_percent_change_from_baseline")
# subset_mobility <- global_mobility_report[cols_keep]
# glo
# subset_mobility$date <- as.Date(subset_mobility$date, format="%Y-%m-%d")
```

```{r}
# global_mobility_report
vis_miss(global_mobility_report, sort_miss = T, warn_large_data= F)
```


```{r}
vis_miss(tx_data, sort_miss = T, warn_large_data= F)
vis_miss(subset_census, sort_miss = T, warn_large_data = F)
```

```{r}
library(RColorBrewer)
plot_vs_county <- function(df, col_val, percentile=FALSE,
                           fips_title="county_fips_code", banks=6, 
                           legend_title="", graphic_title=""){
  # Subset for speed 
  df <- df[c(fips_title, col_val)]
  
  # Get county data
  gcounty <- ggplot2::map_data("county")
  # USA map data
  gusa <- map_data("state")
  
  if (banks > 9){
    mycolors <- colorRampPalette(brewer.pal(9, "Reds"))(banks)
  }
  
  # Format with subregions
  fipstab <-
      transmute(maps::county.fips, fips, county = sub(":.*", "", polyname)) %>%
      unique() %>%
      separate(county, c("region", "subregion"), sep = ",")
  
  # Combine in desired order (NA for missing)
  gcounty <- left_join(gcounty, fipstab, c("region", "subregion"))


  dis <- df
  dis$rprop <- rank(df[col_val])
  dis$pcls <- cut(100 * percent_rank(df[col_val]), seq(0, 100, len = banks),
                        include.lowest = TRUE)

  # Missing data
  anti_join(gcounty, dis, by = c("fips" = fips_title)) %>%
    select(region, subregion) %>%
    unique()
  gcounty_pop <- left_join(gcounty, dis, by = c("fips" = fips_title))
  fill_vals <- gcounty_pop[col_val]

  # Plot
  if (legend_title == ""){
    legend_title <- col_val
  }

  if (percentile == FALSE){
    # names(gcounty_pop)[names(gcounty_pop) == col_val] <- "col_of_interest"
    plt <- ggplot(gcounty_pop) +
      geom_polygon(aes(long, lat, group = group, fill = get(col_val)),
                   color = "grey", size = 0.1, name="Percent Infected") +
      geom_polygon(aes(long, lat, group = group),
                   fill = NA, data = gusa, color = "lightgrey") +
      coord_map("bonne", parameters = 41.6) + ggthemes::theme_map()+
      scale_fill_gradient2()
       # scale_fill_gradient(low = "white", high = "red", na.value = "grey")
      # scale_fill_gradientn(colours = terrain.colors(10))
  }

  if (percentile == TRUE){
    plt <- ggplot(gcounty_pop) +
      geom_polygon(aes(long, lat, group = group, fill = pcls),
                   color = "grey", size = 0.1) +
      geom_polygon(aes(long, lat, group = group),
                   fill = NA, data = gusa, color = "lightgrey") +
      coord_map("bonne", parameters = 41.6) + ggthemes::theme_map() +
      scale_fill_manual(values = mycolors, na.value = "grey") +
      # scale_fill_brewer(palette = "viridis", na.value = "grey") +
      theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
            legend.background = element_rect(fill = NA), 
            legend.position = "left")
  }
  plt <- plt + labs(fill=legend_title) + ggtitle(graphic_title)
  plt
}
```

```{r}
subset_census
```

```{r}
subset_census['pct_infected'] <- subset_census['confirmed_cases']/subset_census['total_pop']
subset_census['pct_deaths'] <- subset_census['deaths']/subset_census['total_pop']
subset_census$death_rate <- subset_census$deaths/subset_census$confirmed_cases
subset_census$county_fips_code <-as.integer(subset_census$county_fips_code)
subset_census
```
```{r}
plot_vs_county(subset_census, "pct_infected", legend_title = "Percent Infected")
plot_vs_county(subset_census, "pct_infected", percentile = TRUE, banks=11, 
               legend_title = "Percentile Infected",
               graphic_title = "Percentile of Percentage of People Infected by County")
plot_vs_county(subset_census, "pct_deaths", percentile = TRUE, banks=11, 
               legend_title = "Percentile Deaths",
               graphic_title = "Percentile of Percentage of Deaths by County")
plot_vs_county(subset_census, "death_rate", percentile = TRUE, banks=11, 
               legend_title = "Percentile Mortality Rate",
               graphic_title = "Percentile of Mortality Rate by County")
plot_vs_county(subset_census, "death_rate", 
               legend_title = "Percentile Mortality Rate",
               graphic_title = "Percentile of Mortality Rate by County")
```

```{r}
census_corr_cols <- c("deaths", "confirmed_cases", "median_income", "male_pop",
                      "female_pop", "total_pop", "median_age", "worked_at_home")
ggcorr(subset_census[census_corr_cols], low="red", mid="grey", high="blue", hjust= .75, size=3, 
       label = TRUE, label_size = 3, label_color = "white") + ggplot2::labs(title = "Pearson Correlation of Important Variables")
```
```{r}
global_mobility_report
```


```{r}
country_date_pct_change <- global_mobility_report %>% select(country_region_code
                                                             | contains("date") 
                                                             | contains("percent"))
country_date_pct_change
```
```{r}
coi_downsampled <- country_date_pct_change %>% filter(country_region_code %in% 
                                                        c("US", "CA", "NZ", "AE", "CN", "DE", "JP")) %>% 
  filter(weekdays(date) == "Monday") %>% group_by(country_region_code, date) %>% summarise_all(mean, na.rm = T) %>% arrange(country_region_code, date)
coi_downsampled
```



```{r}
interested_cols <- c("retail_and_recreation_percent_change_from_baseline",
                     "grocery_and_pharmacy_percent_change_from_baseline",
                     "parks_percent_change_from_baseline",
                     "transit_stations_percent_change_from_baseline",
                     "workplaces_percent_change_from_baseline",
                     "residential_percent_change_from_baseline")

col_labels <- c("Average Retail and Recreation Percent Change from Baseline",
                     "Average Grocery and Pharmacy Percent Change from Baseline",
                     "Average Parks Percent Change from Baseline",
                     "Average Transit Stations Percent Change from Baseline",
                     "Average Workplaces Percent Change from Baseline",
                     "Average Residential Percent Change from Baseline")


for (i in 1:length(col_labels)){
  print(i)
  plt <- ggplot(coi_downsampled,
       aes(x=date, y=get(interested_cols[i]), group=country_region_code,
                            color=country_region_code))+
  geom_point(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)), size=.5)+
  geom_line(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)))+
  # geom_point(size=.5)+geom_line()+
  labs(y = col_labels[i], x = "Date", color = "Country")+
  theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
        axis.title.y = element_text(size = 8))
  save_path <- paste(c("./../imgs/", interested_cols[i], ".png"), collapse = "")
  ggsave(save_path)
  print(plt)
}
# ggplot(coi_downsampled,
#        aes(x=date, y=grocery_and_pharmacy_percent_change_from_baseline, group=country_region_code,
#                             color=country_region_code))+
  # geom_point(aes(y=rollmean(retail_and_recreation_percent_change_from_baseline, k=10, na.pad=TRUE)), size=.5)+
#   geom_line(aes(y=rollmean(retail_and_recreation_percent_change_from_baseline, k=10, na.pad=TRUE)))+
#   # geom_point(size=.5)+geom_line()+
#   labs(y = "Average Grocery and Pharmacy Percent Change from Baseline", x = "Date", color = "Country")+
#   theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
#         axis.title.y = element_text(size = 8))
#   # geom_line()
```

```{r}
tx_county_data <- tx_data %>% filter(county_name != "Statewide Unallocated")
tx_total_state_data <- tx_data %>% filter(county_name == "Statewide Unallocated")
tx_total_state_data$cumulative_deaths <- cumsum(tx_total_state_data$deaths)
tx_total_state_data$cumulative_cases <- cumsum(tx_total_state_data$confirmed_cases)
tx_county_data
tx_total_state_data
```

```{r}

```

```{r}
interested_cols <- c("confirmed_cases", "deaths", "cumulative_deaths", "cumulative_cases")
col_labels <- c("Confirmed Cases Per Day", "Deaths Per Day", "Total Deaths", "Total Cases")

for (i in 1:length(col_labels)){
  print(i)
  plt <- ggplot(tx_total_state_data,
       aes(x=date, y=get(interested_cols[i])))+
  # geom_point(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)), size=.5)+
  # geom_line(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)))+
  # geom_point(size=.5)+geom_line()+
    geom_line()+
  labs(y = col_labels[i], x = "Date")
  # theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
  #       axis.title.y = element_text(size = 8))
  # save_path <- paste(c("./../imgs/", interested_cols[i], ".png"), collapse = "")
  # ggsave(save_path)
  print(plt)
}
```

Since the above doesn't really make sense (only 78 confirmed cases with over a thousand deaths), I am going to analyze on a per county basis and maybe that data will be more clear.

```{r}
tx_by_day_based_on_county <- tx_county_data %>% 
  select(date, confirmed_cases, deaths) %>%
  group_by(date) %>% 
  summarise_all(sum, na.rm = T) %>%
  arrange(date)

# interested_cols <- c("confirmed_cases", "deaths")
# col_labels <- c("Total Cases", "Total Deaths")
# 
# for (i in 1:length(col_labels)){
#   print(i)
#   title <- paste(c("Texas ", col_labels[i]), collapse = "")
#   plt <- ggplot(tx_by_day_based_on_county,
#        aes(x=date, y=get(interested_cols[i])))+
#   # geom_point(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)), size=.5)+
#   # geom_line(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)))+
#   # geom_point(size=.5)+geom_line()+
#     geom_line(color)+
#   labs(y = col_labels[i], x = "Date", title = title)
#   # theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
#   #       axis.title.y = element_text(size = 8))
#   # save_path <- paste(c("./../imgs/", interested_cols[i], ".png"), collapse = "")
#   # ggsave(save_path)
#   print(plt)
# }

```
```{r}
tx_by_day_based_on_county
```



```{r}
library(scales)
# tx_by_day_based_on_county %>% pivot_longer(!date, names_to = "type", values_to = "count")
ggplot(tx_by_day_based_on_county %>% pivot_longer(!date, names_to = "type", values_to = "count"),
       aes(x=date, y=count, group=type, color = type))+
         geom_line()+
  scale_y_continuous(trans = "log10",
                     labels = trans_breaks('log10', math_format(10^.x)))+
  scale_color_manual(labels = c("Confirmed Cases", "Deaths"), values = c("confirmed_cases"="blue",
                                                                         "deaths"="red"))+
  labs(y = "Total Persons", x = "Date", color = "")+
  theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)))+
  ggtitle("Texas COVID-19 Cases")
  ggsave("./../imgs/texas_covid_cases_total.png")
  # scale_y_log10()
         # labs(y = "Total Persons", x = "Date"))

# )
# 
# plt <- ggplot(coi_downsampled,
#        aes(x=date, y=get(interested_cols[i]), group=country_region_code,
#                             color=country_region_code))+
#   geom_point(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)), size=.5)+
#   geom_line(aes(y=rollmean(get(interested_cols[i]), k=10, na.pad=TRUE)))+
#   # geom_point(size=.5)+geom_line()+
#   labs(y = col_labels[i], x = "Date", color = "Country")+
#   theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
#         axis.title.y = element_text(size = 8))
#   save_path <- paste(c("./../imgs/", interested_cols[i], ".png"), collapse = "")
#   ggsave(save_path)
#   print(plt)
```

```{r}
tx_county_data
```


```{r}
subset_census

tx_state <- map_data("state") %>% subset(region == "texas")
tx_county_map_data <- map_data("county") %>% subset(region == "texas")

tx_state
tx_county_map_data

gcounty <- map_data("county")
# 
# fipstab <-
#       transmute(maps::county.fips, fips, county = sub(":.*", "", polyname)) %>%
#       unique() %>%
#       separate(county, c("region", "subregion"), sep = ",")

# Format with subregions
fipstab <-
  transmute(maps::county.fips, fips, county = sub(":.*", "", polyname)) %>%
  unique() %>%
  separate(county, c("region", "subregion"), sep = ",")
  
  # # Combine in desired order (NA for missing)
  # gcounty <- left_join(gcounty, fipstab, c("region", "subregion"))


tx_geo_data <- left_join(gcounty, fipstab, c("region", "subregion")) %>%
  left_join(subset_census, c("fips" = "county_fips_code")) %>% filter(region == "texas") %>%
  unique()

tx_geo_data$dinfect_pcls <- cut(100 * percent_rank(tx_geo_data$pct_infected), seq(0, 100, len = 11),
                        include.lowest = TRUE)
tx_geo_data$deaths_pcls <- cut(100 * percent_rank(tx_geo_data$pct_deaths), seq(0, 100, len = 11),
                        include.lowest = TRUE)

tx_geo_data



# 
# # Lowercase
# tx_county_data$county_name <- tolower(tx_county_data$county_name)
# 
# # Remove ' county'
# tx_county_data$county_name <- sub("\\s*county\\b.*", "", tx_county_data$county_name)
# tx_county_data
# 
# # Get max confirmed cases and deaths
# tx_county_data
# 
# 
# # # Join the data with state geo info
# # tx_geo_data <- left_join(tx_county_map_data, tx_county_data, by = c("subregion" = "county_name"))
# # tx_geo_data
```
```{r}

mycolors <- colorRampPalette(brewer.pal(9, "Reds"))(11)

ggplot(tx_geo_data)+
  coord_map() + ggthemes::theme_map()+
  geom_polygon(aes(long, lat, group = group, fill=deaths))+
  # scale_fill_manual(values = mycolors, na.value = "grey") +
  # theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
  #       legend.background = element_rect(fill = NA), 
  #       legend.position = "left")+
  labs(fill="Percentil") + ggtitle("graphic_title")



# plt <- ggplot(gcounty_pop) +
#       geom_polygon(aes(long, lat, group = group, fill = dinfect_pcls),
#                    color = "grey", size = 0.1) +
#       geom_polygon(aes(long, lat, group = group),
#                    fill = NA, data = gusa, color = "lightgrey") +
#       coord_map("bonne", parameters = 41.6) + ggthemes::theme_map() +
#       scale_fill_manual(values = mycolors, na.value = "grey") +
#       # scale_fill_brewer(palette = "viridis", na.value = "grey") +
#       theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
#             legend.background = element_rect(fill = NA), 
#             legend.position = "left")

# ggplot(tx_geo_data) +
#          coord_map() + 
#   ggthemes::theme_map()+
#   geom_polygon(aes(long, lat, group = subregion, fill = confirmed_cases))
# pc_cont_iowa <- geom_polygon(aes(long, lat, group = group, fill = pchange),
#                              color = "grey", size = 0.2)
```

```{r}
# dallas_data <- tx_county_data %>% filter(county_name == "dallas") 
dallas_data <- tx_county_data %>% filter(county_name == "dallas") %>% 
  select(date, confirmed_cases, deaths)

ggplot(dallas_data %>% pivot_longer(!date, names_to = "type", values_to = "count"),
       aes(x=date, y=count, group=type, color = type))+
         geom_line()+
  # scale_y_continuous(trans = "log10", labels = trans_breaks("log10", math_format(10^.x)))+
  # scale_y_log10(breaks=c(100, 300, 500, 1000, 3000, 5000, 10000, 30000, 50000, 100000, 300000),
  #               labels=c('100', '300', '500', '1000', '3000', '5000', '10000', '30000', '50000', '100000', '300000'))+
  scale_color_manual(labels = c("Confirmed Cases", "Deaths"), values = c("confirmed_cases"="blue",
                                                                         "deaths"="red"))+
  labs(y = "Total Persons", x = "Date", color = "")+
  theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)))+
  ggtitle("Dallas COVID-19 Cases")
  # ggsave("./../imgs/dallas_covid_cases_total.png")
```


```{r}
sgcounty <- map_data("county")

# Format with subregions
fipstab <-
  transmute(maps::county.fips, fips, county = sub(":.*", "", polyname)) %>%
  unique() %>%
  separate(county, c("region", "subregion"), sep = ",")

us_geo_data <- left_join(gcounty, fipstab, c("region", "subregion")) %>%
  left_join(subset_census, c("fips" = "county_fips_code")) %>%
  unique() %>% select(long, lat, region, subregion, group, confirmed_cases, deaths, median_income, 
                      male_pop, female_pop, total_pop, median_age,
                      worked_at_home)
# us_geo_data

# Get the total cases by state
state_data_breakdown <- us_geo_data %>% 
  drop_na() %>% select(region, subregion, confirmed_cases, deaths, median_income, 
                      male_pop, female_pop, total_pop, median_age,
                      worked_at_home) %>% unique() %>%
  group_by(region) %>% select_if(is.numeric) %>%
  summarise_all(sum)

state_data_breakdown$pct_deaths <- state_data_breakdown$deaths/state_data_breakdown$total_pop
state_data_breakdown$pct_infect <- state_data_breakdown$confirmed_cases/state_data_breakdown$total_pop
state_data_breakdown$death_rate <- state_data_breakdown$deaths/state_data_breakdown$confirmed_cases

print(state_data_breakdown %>% arrange(death_rate, decreasing=T))

state_geo_data <- us_geo_data %>% select(long, lat, region, group) %>% left_join(state_data_breakdown,
                                                        by = "region")

state_geo_data

# ggplot(tx_by_day_based_on_county %>% pivot_longer(!date, names_to = "type", values_to = "count"),
#        aes(x=date, y=count, group=type, color = type))+
#          geom_line()+
#   scale_y_continuous(trans = "log10",
#                      labels = trans_breaks('log10', math_format(10^.x)))+
#   scale_color_manual(labels = c("Confirmed Cases", "Deaths"), values = c("confirmed_cases"="blue",
#                                                                          "deaths"="red"))+
#   labs(y = "Total Persons", x = "Date", color = "")+
#   theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)))+
#   ggtitle("US COVID-19 Cases")
  # ggsave("./../imgs/texas_covid_cases_total.png")
```

```{r}
# state_geo_data$state_group <- factor
state_geo_data$state_group <- factor(state_geo_data$region) 
state_geo_data$state_group <- as.numeric(state_geo_data$state_group)
state_geo_data
```

```{r}
# cf <- coord_fixed()
# cf$default <- TRUE
# state_geo_data$state_group <- 
plt <- ggplot(state_geo_data)+
  # geom_polygon(data=subset(map_data("state")), aes(x=long, y=lat, group=group, fill=death_rate))+
  geom_polygon(aes(long, lat, group = group, fill=death_rate))+
  # scale_fill_manual(values = mycolors, na.value = "grey") +
  # theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
  #       legend.background = element_rect(fill = NA), 
  #       legend.position = "left")+
  
  # geom_polygon(aes(long, lat, group = group, fill = get(col_val)),
  #                  color = "grey", size = 0.1, name="Percent Infected") +
      geom_polygon(aes(long, lat, group = group),
                   fill = NA, data = map_data("state"), color = "lightgrey") +
      coord_map("bonne", parameters = 41.6) + ggthemes::theme_map()
  
  # labs(fill="Percentil") + ggtitle("graphic_title")+
  # coord_map("bonne", parameters = 41.6) + ggthemes::theme_map()
plt
```


```{r}
# http://homepage.stat.uiowa.edu/~luke/classes/STAT4580-2020/maps.html
ggplot(state_geo_data)+
  # geom_polygon(data=subset(map_data("state")), aes(x=long, y=lat, group=group, fill=death_rate))+
  geom_polygon(aes(long, lat, group = group, fill=death_rate))+
  # scale_fill_manual(values = mycolors, na.value = "grey") +
  # theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
  #       legend.background = element_rect(fill = NA), 
  #       legend.position = "left")+
  
  # geom_polygon(aes(long, lat, group = group, fill = get(col_val)),
  #                  color = "grey", size = 0.1, name="Percent Infected") +
      geom_polygon(aes(long, lat, group = group),
                   fill = NA, data = map_data("state"), color = "lightgrey") +
      coord_map("bonne", parameters = 41.6) + ggthemes::theme_map()+
  
  labs(fill="Percentil") + ggtitle("graphic_title")+
  coord_map("bonne", parameters = 41.6) + ggthemes::theme_map()
```

```{r}
gusa_pop <- state_geo_data
```













